## Regression models of placement outcomes
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0       ✔ purrr   0.3.1  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(broom)
library(forcats)
library(rstanarm)
## Loading required package: Rcpp
## rstanarm (Version 2.18.2, packaged: 2018-11-08 22:19:38 UTC)
## - Do not expect the default priors to remain the same in future rstanarm versions.
## Thus, R scripts should specify priors explicitly, even if they are just the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
## - Plotting theme set to bayesplot::theme_default().
options(mc.cores = parallel::detectCores() - 2)
## bayesplot makes itself the default theme
theme_set(theme_minimal())

library(tictoc)
library(assertthat)
## 
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
## 
##     has_name
## Suppress messages when generating HTML file
knitr::opts_chunk$set(message = FALSE)

source('../R/predictions.R')
source('../R/posterior_estimates.R')
data_folder = '../data/'
output_folder = '../plots/'

# cluster_distances = read_csv(str_c(data_folder, 
#                                    '00_k9distances_2019-03-15.csv')) %>% 
#     count(cluster = cluster, average_distance = avgDist) %>% 
#     mutate(cluster = as.character(cluster))
# 
# ggplot(cluster_distances, aes(cluster, scale(average_distance))) +
#     geom_label(aes(label = n, fill = n, size = n), color = 'white')

load(str_c(data_folder, '01_parsed.Rdata'))
univ_df = read_rds(str_c(data_folder, '02_univ_net_stats.rds')) #%>% 
    # left_join(cluster_distances)

individual_df = individual_df %>%
    left_join(univ_df, by = c('placing_univ_id' = 'univ_id')) %>%
    ## Use the canonical names from univ_df
    select(-placing_univ) %>%
    ## Drop NAs
    # filter(complete.cases(.))
    filter_at(vars('permanent', 'aos_category', 
                   'graduation_year', 'prestige', 
                   'community', 'cluster_lvl4',
                   'gender', 'frac_w', 
                   'frac_high_prestige', 'total_placements'), 
              all_vars(negate(is.na)(.))) %>%
    rename(cluster = cluster_lvl4) %>% 
    mutate(perc_w = 100*frac_w, 
           perc_high_prestige = 100*frac_high_prestige)

## Variables to consider: aos_category; graduation_year; placement_year; prestige; out_centrality; cluster; community; placing_univ_id; gender; country; perc_w; total_placements

## Giant pairs plot/correlogram ----
## perc_high_prestige, out_centrality, and prestige are all tightly correlated
## All other pairs have low to moderate correlation
individual_df %>% 
    select(permanent, aos_category, aos_diversity, perc_high_prestige,
           graduation_year, placement_year, prestige, 
           in_centrality, out_centrality, community, 
           cluster, #average_distance,
           gender, country, perc_w, 
           total_placements) %>% 
    mutate_if(negate(is.numeric), function(x) as.integer(as.factor(x))) %>% 
    mutate_at(vars(in_centrality, out_centrality), log10) %>% 
    # GGally::ggpairs()
    cor() %>% 
    as_tibble(rownames = 'Var1') %>% 
    gather(key = 'Var2', value = 'cor', -Var1) %>% 
    ggplot(aes(Var1, Var2, fill = cor)) +
    geom_tile() +
    geom_text(aes(label = round(cor, digits = 2)), 
              color = 'white') +
    scale_fill_gradient2()

## No indication that AOS diversity has any effect
ggplot(individual_df, aes(aos_diversity, 1*permanent)) + 
    geom_point() +
    geom_smooth(method = 'loess')

## And not for fraction of PhDs awarded to women women, either
ggplot(individual_df, aes(frac_w, 1*permanent)) +
    geom_point() +
    geom_smooth(method = 'loess')

## Descriptive statistics ----
## Individual-level variables (all discrete)
individual_df %>%
    select(permanent, aos_category, 
           graduation_year, placement_year, 
           gender) %>%
    gather(key = variable, value = value) %>%
    count(variable, value) %>% 
    mutate(variable = str_replace_all(variable, '_', ' ')) %>% 
    ggplot(aes(fct_rev(value), n, group = variable)) +
    geom_col(aes(fill = variable), show.legend = FALSE) +
    scale_fill_brewer(palette = 'Set1') +
    xlab('') +
    coord_flip() +
    facet_wrap(vars(variable), scales = 'free')
## Warning: attributes are not identical across measure variables;
## they will be dropped

ggsave(str_c(output_folder, '03_descriptive_1.png'), 
       height = 2*2, width = 2*3, scale = 1.5)

## Program-level categorical
individual_df %>%
    select(prestige, country, 
           community, cluster) %>%
    gather(key = variable, value = value) %>%
    count(variable, value) %>% 
    ggplot(aes(fct_rev(value), n, group = variable)) +
    geom_col(aes(fill = variable), show.legend = FALSE) +
    xlab('') +
    coord_flip() +
    facet_wrap(vars(variable), scales = 'free')

ggsave(str_c(output_folder, '03_descriptive_2.png'), 
       height = 2*2, width = 2*2, scale = 1.5)

## Program-level continuous variables
# individual_df %>%
#     select(frac_w, total_placements, perm_placement_rate) %>%
#     gather(key = variable, value = value) %>%
#     group_by(variable) %>%
#     summarize_at(vars(value), 
#                  funs(min, max, mean, median, sd), 
#                  na.rm = TRUE)

program_cont = individual_df %>% 
    mutate(in_centrality = log10(in_centrality)) %>% 
    select(`women share` = frac_w, 
           `total placements` = total_placements, 
           `permanent placement rate` = perm_placement_rate, 
           `AOS diversity (bits)` = aos_diversity,
           `hiring centrality (log10)` = in_centrality) %>% 
    gather(key = variable, value = value)

ggplot(program_cont, aes(value)) +
    geom_density() +
    geom_rug() +
    geom_vline(data = {program_cont %>% 
            group_by(variable) %>% 
            summarize(mean = mean(value))}, 
            aes(xintercept = mean, 
                color = 'mean')) +
    geom_vline(data = {program_cont %>% 
            group_by(variable) %>% 
            summarize(median = median(value))}, 
            aes(xintercept = median, 
                color = 'median')) +
    scale_color_brewer(palette = 'Set1', 
                       name = 'summary\nstatistic') +
    facet_wrap(~ variable, scales = 'free')

ggsave(str_c(output_folder, '03_descriptive_3.png'), 
       height = 2*2, width = 2*3.5, scale = 1.5)


## Model -----
model_file = str_c(data_folder, '03_model.Rds')
if (!file.exists(model_file)) {
    ## ~700 seconds
    tic()
    model = individual_df %>% 
        mutate(prestige = fct_relevel(prestige, 'low-prestige'), 
               country = fct_relevel(country, 'U.S.')) %>% 
        stan_glmer(formula = permanent ~ 
                       (1|aos_category) +
                       gender + 
                       (1|graduation_year) +
                       (1|placement_year) +
                       1 +
                       aos_diversity +
                       (1|community) +
                       (1|cluster) +
                       # average_distance +
                       log10(in_centrality) +
                       total_placements +
                       perc_w +
                       country +
                       prestige,
                   family = 'binomial',
                   ## Priors
                   ## Constant and coefficients
                   prior_intercept = normal(0, .5), ## constant term + random intercepts
                   prior = normal(0, .5),
                   ## error sd
                   prior_aux = exponential(rate = 1, 
                                           autoscale = TRUE),
                   ## random effects covariance
                   prior_covariance =  decov(regularization = 1, 
                                             concentration = 1, 
                                             shape = 1, scale = 1),
                   seed = 1159518215,
                   adapt_delta = .99,
                   chains = 4, iter = 4000)
    toc()
    write_rds(model, model_file)
} else {
    model = read_rds(model_file)
}
## Check ESS and Rhat
## Rhats all look good.  ESS a little low for grad years + some sigmas
model %>%
    summary() %>%
    as.data.frame() %>%
    rownames_to_column('parameter') %>%
    select(parameter, n_eff, Rhat) %>%
    # knitr::kable()
    ggplot(aes(n_eff, Rhat, label = parameter)) +
    geom_point() +
    geom_vline(xintercept = 4000) +
    geom_hline(yintercept = 1.01)

if (require(plotly)) {
    plotly::ggplotly()    
}
## Variables w/ fewer than 3000 effective draws
## covariance on random intercepts; log posterior
model %>% 
    summary() %>% 
    as.data.frame() %>% 
    rownames_to_column('parameter') %>% 
    as_tibble() %>% 
    filter(n_eff < 3000) %>% 
    select(parameter, n_eff)
## # A tibble: 2 x 2
##   parameter                                n_eff
##   <chr>                                    <dbl>
## 1 Sigma[community:(Intercept),(Intercept)]  1968
## 2 log-posterior                             1710
## Check predictions
pp_check(model, nreps = 200)

pp_check(model, nreps = 200, plotfun = 'ppc_bars')

## <https://arxiv.org/pdf/1605.01311.pdf>
pp_check(model, nreps = 200, plotfun = 'ppc_rootogram')

pp_check(model, nreps = 200, plotfun = 'ppc_rootogram', 
         style = 'hanging')

estimates = posterior_estimates(model)

estimates %>% 
    filter(entity != 'intercept', 
           group != 'placement_year') %>% 
    mutate_if(is.numeric, ~ . - 1) %>% 
    ggplot(aes(x = level, y = estimate, 
           ymin = lower, ymax = upper, 
           color = group)) +
    geom_hline(yintercept = 0, linetype = 'dashed') +
    geom_pointrange() + 
    scale_color_viridis_d(name = 'covariate\ngroup') +
    xlab('') + #ylab('') +
    scale_y_continuous(labels = scales::percent_format(), 
                       name = '') +
    coord_flip() +
    facet_wrap(~ entity, scales = 'free')

ggsave(str_c(output_folder, '03_estimates.png'), 
       width = 10, height = 5, 
       scale = 1.5)


sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] plotly_4.8.0     bindrcpp_0.2.2   assertthat_0.2.0 tictoc_1.0      
##  [5] rstanarm_2.18.2  Rcpp_1.0.0       broom_0.5.1      forcats_0.4.0   
##  [9] stringr_1.4.0    dplyr_0.8.0.1    purrr_0.3.1      readr_1.3.1     
## [13] tidyr_0.8.3      tibble_2.0.1     ggplot2_3.1.0    tidyverse_1.2.1 
## 
## loaded via a namespace (and not attached):
##  [1] minqa_1.2.4        colorspace_1.4-0   ggridges_0.5.1    
##  [4] rsconnect_0.8.13   markdown_0.9       base64enc_0.1-3   
##  [7] rstudioapi_0.9.0   rstan_2.18.2       DT_0.5            
## [10] fansi_0.4.0        lubridate_1.7.4    xml2_1.2.0        
## [13] codetools_0.2-16   splines_3.5.1      knitr_1.22        
## [16] shinythemes_1.1.2  bayesplot_1.6.0    jsonlite_1.6      
## [19] nloptr_1.2.1       shiny_1.2.0        compiler_3.5.1    
## [22] httr_1.4.0         backports_1.1.3    Matrix_1.2-16     
## [25] lazyeval_0.2.1     cli_1.0.1          later_0.8.0       
## [28] htmltools_0.3.6    prettyunits_1.0.2  tools_3.5.1       
## [31] igraph_1.2.4       gtable_0.2.0       glue_1.3.1        
## [34] reshape2_1.4.3     cellranger_1.1.0   nlme_3.1-137      
## [37] crosstalk_1.0.0    xfun_0.5           ps_1.3.0          
## [40] lme4_1.1-21        rvest_0.3.2        mime_0.6          
## [43] miniUI_0.1.1.1     gtools_3.8.1       MASS_7.3-51.1     
## [46] zoo_1.8-4          scales_1.0.0       colourpicker_1.0  
## [49] hms_0.4.2          promises_1.0.1     parallel_3.5.1    
## [52] inline_0.3.15      shinystan_2.5.0    RColorBrewer_1.1-2
## [55] yaml_2.2.0         gridExtra_2.3      loo_2.1.0         
## [58] StanHeaders_2.18.1 stringi_1.4.3      highr_0.7         
## [61] dygraphs_1.1.1.6   boot_1.3-20        pkgbuild_1.0.2    
## [64] rlang_0.3.1        pkgconfig_2.0.2    matrixStats_0.54.0
## [67] evaluate_0.13      lattice_0.20-38    bindr_0.1.1       
## [70] rstantools_1.5.1   htmlwidgets_1.3    labeling_0.3      
## [73] processx_3.3.0     tidyselect_0.2.5   plyr_1.8.4        
## [76] magrittr_1.5       R6_2.4.0           generics_0.0.2    
## [79] pillar_1.3.1       haven_2.1.0        withr_2.1.2       
## [82] xts_0.11-2         survival_2.43-3    modelr_0.1.4      
## [85] crayon_1.3.4       utf8_1.1.4         rmarkdown_1.12    
## [88] grid_3.5.1         readxl_1.3.1       data.table_1.12.0 
## [91] callr_3.2.0        threejs_0.3.1      digest_0.6.18     
## [94] xtable_1.8-3       httpuv_1.4.5.1     stats4_3.5.1      
## [97] munsell_0.5.0      viridisLite_0.3.0  shinyjs_1.0